# Authors: D.S. Hansen & Gijs Schumacher
# Email: g.schumacher@uva.nl
# Replication code paper "A New Dataset of Dutch and Danish Party Congress Speeches"

# Last updated: February 2019
# R version 3.5.2 (2018-12-20)


# Basic setup -------------------------------------------------------------
# NOTE: the workspace is deliberately not wiped with rm(list = ls()) here.
# That call only removes objects from the global environment (attached
# packages and options are left untouched), so it does not give a clean
# session, and it destroys the objects of anyone who source()s this file.
# Run the script in a fresh R session instead.
library(quanteda) # quanteda_1.4.0
library(stringr) # stringr_1.4.0
library(openxlsx) # openxlsx_4.1.0
# Explicit print(): top-level values are not auto-printed under source(),
# so a bare sessionInfo() call would show nothing there.
print(sessionInfo()) # Check which versions you have installed

# Load Speech corpus from Dataverse. Script based on V2.
# load() restores the saved objects into the workspace and returns their
# names; the next step needs one of them to be called `corpus`.
loaded.objects <- load("corpus_DK_NL.Rdata")
# Fail early with a clear message: if the file held no `corpus` object, the
# name below would resolve to quanteda's corpus() function (or to a stale
# object left in the workspace) and produce a confusing error or wrong data.
if (!"corpus" %in% loaded.objects) {
  stop("corpus_DK_NL.Rdata does not contain an object named 'corpus'",
       call. = FALSE)
}
# Keep only the documents whose `country` document variable equals "NL".
text <- corpus_subset(corpus, country == "NL")

# Load sentiment data. NRC Emotion Lexicon (v.092, downloaded from http://sentiment.nrc.ca/lexicons-for-research/)
sentiment.url <- "http://sentiment.nrc.ca/lexicons-for-research/NRC-Emotion-Lexicon.zip"
temp <- tempfile(fileext = ".zip")
# mode = "wb": the archive is binary. Request a binary transfer explicitly
# rather than relying on download.file() guessing it from the URL extension
# (a text-mode transfer on Windows would corrupt the zip).
download.file(sentiment.url, temp, mode = "wb")
# Extracts into the current working directory; the read.xlsx() path below is
# relative to it.
unzip(temp)
# The downloaded archive is no longer needed once it has been extracted.
unlink(temp)
sentiment <- read.xlsx("NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx")
# The columns are picked by position, so make sure the sheet is wide enough
# before subsetting.
if (ncol(sentiment) < 107) {
  stop("Unexpected lexicon layout: expected at least 107 columns, found ",
       ncol(sentiment), call. = FALSE)
}
# Keep three columns: the word list and two indicator columns.
# NOTE(review): presumably column 21 holds the Dutch translations and columns
# 106/107 the positive/negative flags -- confirm against the spreadsheet header.
sentiment <- sentiment[, c(21, 106, 107)]

# in case the NRC Emotion Lexicon is no longer available online
#load("sentiment.nl.RData")
#sentiment <- sentiment.nl

# Do sentiment analysis ---------------------------------------------------
# Two-key dictionary built from `sentiment`: column 1 supplies the words, and
# a word goes under "positive" / "negative" when column 2 / column 3 equals 1.
# %in% (like the which() it replaces) treats a missing flag as "not selected".
dictionary <- dictionary(list(
  positive = sentiment[sentiment[, 2] %in% 1, 1],
  negative = sentiment[sentiment[, 3] %in% 1, 1]
))

# Making dfm with speech data with positive and negative words
# Tokenise explicitly and use dfm_lookup(): both functions exist in the
# quanteda version this script was written for, whereas dfm(<corpus>) and
# dfm(<dfm>, dictionary = ) are deprecated or removed in later releases.
dfm <- dfm(tokens(text))
# One column per dictionary key, holding the number of matches per speech.
dfm.sentiment <- dfm_lookup(dfm, dictionary = dictionary)

# Total scores
# positive / negative: dictionary matches as a percentage of all tokens in
# each speech. Columns 1 and 2 of dfm.sentiment are taken to be the
# "positive" and "negative" keys, i.e. the order the dictionary was built in.
tokens.per.doc <- ntoken(dfm)
positive <- as.matrix((dfm.sentiment[, 1] / tokens.per.doc) * 100)
negative <- as.matrix((dfm.sentiment[, 2] / tokens.per.doc) * 100)
# arousal: overall share of sentiment-bearing tokens.
arousal <- as.matrix(positive + negative)
# polarity: balance of positive vs negative, in [-1, 1]; NaN for a speech
# without any dictionary match (0 / 0).
polarity <- as.matrix((positive - negative) / arousal)

# Name the score columns when the data frame is built. Renaming by position
# afterwards (columns 92:93) silently labels the wrong columns as soon as the
# number of document variables changes. `[, 1]` turns each one-column matrix
# into a vector that keeps the document names.
dutch.speeches.analysis <- data.frame(docvars(text),
                                      positive = positive[, 1],
                                      negative = negative[, 1],
                                      polarity = polarity[, 1],
                                      arousal = arousal[, 1])


# Create some additional variables --------------------------------------
# The third digit of the year gives the decade; the fourth digit decides
# whether the year falls in the first (0-4) or second (5-9) half of it.
# NOTE(review): assumes four-digit years. The century is not part of the
# label, so years 100 apart would share a period -- confirm the corpus spans
# less than a century.
year.chr <- as.character(dutch.speeches.analysis$year)
decade <- substr(year.chr, 3, 3)
# Compare the last digit as an integer instead of comparing a string with a
# number, which falls back to locale-dependent string comparison.
late.half <- as.integer(substr(year.chr, 4, 4)) >= 5L

# fiveyear: period label such as "85-89"; fiveyear2: its first year, "85".
dutch.speeches.analysis$fiveyear <- paste0(
  decade,
  ifelse(late.half, paste0("5-", decade, "9"), paste0("0-", decade, "4"))
)
dutch.speeches.analysis$fiveyear2 <- paste0(decade, ifelse(late.half, "5", "0"))

# Factor levels follow the order in which the periods first appear in the data.
dutch.speeches.analysis$fiveyear <- factor(
  dutch.speeches.analysis$fiveyear,
  levels = unique(dutch.speeches.analysis$fiveyear)
)
dutch.speeches.analysis$fiveyear2 <- factor(
  dutch.speeches.analysis$fiveyear2,
  levels = unique(dutch.speeches.analysis$fiveyear2)
)

# party2: ARP, CHU and KVP are recoded to "CDA"; every other party keeps its
# own label. as.character() guards against `party` being a factor, in which
# case ifelse() would return the integer level codes instead of the labels.
dutch.speeches.analysis$party2 <- ifelse(
  dutch.speeches.analysis$party %in% c("ARP", "CHU", "KVP"),
  "CDA",
  as.character(dutch.speeches.analysis$party)
)


# Write out ---------------------------------------------------------------
#setwd("C:/Users/gschuma1/surfdrive/Papers/Published Work/Congress speeches data paper/Data & Analysis/")
# Store the analysis data frame, under its own name, in an .Rdata file in the
# current working directory.
save(list = "dutch.speeches.analysis",
     file = "dutch.speeches.analysis.Rdata")





